Now we run this a second time, on the second (b) feature table, from which all epithets with fewer than 27 representative documents have been removed. The results are better (overall F1 score for decision tree is 0.44, random forest is 0.47; in run (a) these were 0.33 and 0.40, respectively).
In [1]:
import os
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [2]:
from sklearn import clone
from sklearn import preprocessing
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
import datetime as dt
In [3]:
# Load the pickled feature table from the user's CLTK data directory.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# that were produced locally by a trusted notebook.
fp_df = os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle')
dataframe_bow = joblib.load(fp_df)
In [4]:
# Target labels: the 'epithet' column of the feature table.
Y = dataframe_bow['epithet']
In [5]:
# Feature matrix: every column except the label ('epithet') and the
# identifying 'id' / 'author' columns.
# The axis is passed by keyword: newer pandas releases no longer accept it
# as a positional argument, while `axis=1` works on old and new versions.
X = dataframe_bow.drop(['epithet', 'id', 'author'], axis=1)
In [6]:
# Split features and labels into train and test portions; random_state=0
# pins the shuffle so repeated runs produce the same split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
In [7]:
def scale_data(X_train, X_test, Y_train, Y_test):
    """Standardize the train and test feature matrices.

    Preprocessing step: a ``StandardScaler`` is fitted on ``X_train`` only
    (scaled data has zero mean and unit variance) and is then applied to both
    ``X_train`` and ``X_test``. The fitted scaler is pickled to
    ``~/cltk_data/user_data/tlg_bow_scaler.pickle`` so it can be reused later
    with testing/prediction data.

    Returns the tuple ``(X_train_scaled, X_test_scaled, Y_train, Y_test)``;
    the two label arguments are handed back unchanged.
    """
    print('Scaling data ...')
    started = dt.datetime.utcnow()

    # Fit on the training split only.
    fitted_scaler = preprocessing.StandardScaler()
    fitted_scaler.fit(X_train)

    # Persist the scaler before transforming anything.
    scaler_path = os.path.expanduser(
        '~/cltk_data/user_data/tlg_bow_scaler.pickle')
    joblib.dump(fitted_scaler, scaler_path)

    train_scaled, test_scaled = [
        fitted_scaler.transform(matrix) for matrix in (X_train, X_test)
    ]

    elapsed = dt.datetime.utcnow() - started
    print('... finished in {} secs.'.format(elapsed))
    print()
    return train_scaled, test_scaled, Y_train, Y_test
In [8]:
# Standardize the features. Note that scale_data also pickles the fitted
# scaler to disk as a side effect.
X_train_scaled, X_test_scaled, Y_train, Y_test = scale_data(X_train, X_test, Y_train, Y_test)
In [9]:
def run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Fit a scikit-learn decision tree, pickle it, and print a test report.

    The model is built with its pre-defined (default) parameters; these
    could instead be learned from the data. A parameter worth experimenting
    with is ``max_depth``.

    The fitted tree is written to
    ``~/cltk_data/user_data/tlg_bow_dt.pickle``. Predictions for
    ``X_test_scaled``, the expected labels ``Y_test`` and a classification
    report are printed. Nothing is returned.
    """
    print('Defining and fitting models ...')
    started = dt.datetime.utcnow()

    tree_model = DecisionTreeClassifier()
    tree_model.fit(X_train_scaled, Y_train)

    model_path = os.path.expanduser('~/cltk_data/user_data/tlg_bow_dt.pickle')
    joblib.dump(tree_model, model_path)

    elapsed = dt.datetime.utcnow() - started
    print('... finished in {} secs.'.format(elapsed))
    print()

    predicted = tree_model.predict(X_test_scaled)
    print('tree_predictions ', predicted)
    print('actual_values ', Y_test)
    print()

    print('----Tree_report--------------------------------')
    report = classification_report(Y_test, predicted)
    print(report)
In [10]:
# Train the decision tree on the scaled data and print its test-set report.
run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test)
In [12]:
def run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Fit a scikit-learn random forest, pickle it, and print a test report.

    A parameter worth experimenting with is ``n_estimators`` (fixed at 30
    here).

    The fitted forest is written to
    ``~/cltk_data/user_data/tlg_bow_fandom_forest.pickle``. Predictions for
    ``X_test_scaled``, the expected labels ``Y_test`` and a classification
    report are printed. Nothing is returned.
    """
    t0 = dt.datetime.utcnow()
    n_estimators = 30

    # Train. (An earlier revision cloned the estimator and then discarded
    # the clone, and also scored the training set without using the result;
    # both were dead work and have been dropped.)
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(X_train_scaled, Y_train)

    # NOTE(review): "fandom_forest" looks like a typo for "random_forest".
    # The file name is kept as-is because other code may load the model by
    # this exact path -- rename both sides together if you fix it.
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_fandom_forest.pickle')
    joblib.dump(clf, fp_model_pickle)

    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()

    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)
    expected = Y_test
    print('actual_values ', expected)
    print()

    print('----Random forest report--------------------------------')
    print(classification_report(expected, Y_prediction))
In [13]:
# Train the random forest on the scaled data and print its test-set report.
run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test)
In [ ]:
def run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Fit a scikit-learn linear SVC, pickle it, and print a test report.

    The model is defined with a pre-defined parameter (``C=100.``); such
    parameters could instead be learned from the data.

    The fitted model is written to
    ``~/cltk_data/user_data/tlg_bow_svc.pickle``. Predictions for
    ``X_test_scaled``, the expected labels ``Y_test`` and a classification
    report are printed. Nothing is returned.
    """
    print('Defining and fitting SVC model ...')
    started = dt.datetime.utcnow()

    linear_svc = svm.LinearSVC(C=100.)
    linear_svc.fit(X_train_scaled, Y_train)

    model_path = os.path.expanduser('~/cltk_data/user_data/tlg_bow_svc.pickle')
    joblib.dump(linear_svc, model_path)

    elapsed = dt.datetime.utcnow() - started
    print('... finished in {} secs.'.format(elapsed))
    print()

    predicted = linear_svc.predict(X_test_scaled)
    print('svc_predictions ', predicted)
    print('actual_values ', Y_test)
    print()

    print('----SVC_report--------------------------------')
    report = classification_report(Y_test, predicted)
    print(report)
In [ ]:
# Train the linear SVC on the scaled data and print its test-set report.
run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test)
In [ ]:
def run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Fit a scikit-learn AdaBoost classifier, pickle it, and print a report.

    The ensemble boosts decision trees of ``max_depth=3``. A parameter worth
    experimenting with is ``n_estimators`` (fixed at 30 here).

    For plotting see:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_iris.html

    The fitted model is written to
    ``~/cltk_data/user_data/tlg_bow_ada_boost.pickle``. Predictions for
    ``X_test_scaled``, the expected labels ``Y_test`` and a classification
    report are printed. Nothing is returned.
    """
    n_estimators = 30

    # Train. (An earlier revision cloned the estimator and then discarded
    # the clone, and also scored the training set without using the result;
    # both were dead work and have been dropped.)
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=n_estimators)
    clf.fit(X_train_scaled, Y_train)

    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_ada_boost.pickle')
    joblib.dump(clf, fp_model_pickle)

    Y_prediction = clf.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction)
    expected = Y_test
    print('actual_values ', expected)
    print()

    print(classification_report(expected, Y_prediction))
In [ ]:
# Train the AdaBoost ensemble on the scaled data and print its test-set report.
run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test)